# Folder holding the raw data files used throughout the notebook
data_folder = "data"
# NOTE(review): hard-coded Mapbox token committed to source — consider loading
# it from an environment variable or an untracked config file instead
MAPBOX_ACCESS_TOKEN = 'pk.eyJ1IjoiYnpkZWNvIiwiYSI6ImNrNDVmNTU5ZTA1dnIzZXJ2Zm9vbmZsd2EifQ.KvNq66ornrV9ACa98F0s9w'
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import json
import shapely
from shapely.geometry import shape, Point
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
Our wrapper functions for plotting maps using Plotly:
import copy
import os
import re
import json
import pandas as pd
import numpy as np
import plotly.graph_objects as go
def insert_variables_as_properties_in_geodata(geodata, dataframe, key_property, key_column, variable_names):
    """Return a deep copy of *geodata* whose features carry extra properties.

    For each GeoJSON feature, the row of *dataframe* whose *key_column*
    equals the feature's *key_property* is looked up; the row's
    *variable_names* values are stored, stringified, under the same names in
    the feature's ``properties``. Features without a matching row are left
    untouched, and the input *geodata* is never modified.
    """
    enriched = copy.deepcopy(geodata)
    for feature in enriched['features']:
        area_key = feature['properties'][key_property]
        matching = dataframe[dataframe[key_column] == area_key]
        if matching.empty:
            continue
        first_row_values = matching[variable_names].values[0]
        for variable, raw_value in zip(variable_names, first_row_values):
            feature['properties'][variable] = str(raw_value)
    return enriched
def format_template(template, variable_names):
    """Turn ``{}`` placeholders into Plotly hovertemplate references.

    Each successive ``{}`` in *template* is replaced (left to right, one per
    name) by ``%{properties.<name>}``. The input string is left unchanged;
    surplus placeholders survive untouched.
    """
    result = template
    for variable in variable_names:
        result = result.replace('{}', '%{properties.' + variable + '}', 1)
    return result
def load_geodata(geodata_path, key_property):
    """Load a GeoJSON file and return it as a dict.

    Every feature additionally receives a top-level ``id`` copied from
    ``properties[key_property]`` — Choroplethmapbox uses that id to join
    ``locations`` values to features.
    """
    with open(os.path.join(geodata_path)) as geojson_file:
        geodata = json.load(geojson_file)
    for feature in geodata['features']:
        feature['id'] = feature['properties'][key_property]
    return geodata
def plot_map(title, geodata_path, dataframe, key_property, key_column, value_column, template, mapbox_access_token, options):
    '''
    Draw a single choropleth mapbox figure centred on Chicago.

    title: plot title
    geodata_path: path to the geojson file
    dataframe: DataFrame with:
        - a key column (name given by key_column) identifying each area,
        - a value column (name given by value_column) plotted as the area color,
        - any further columns, interpolated into the hover template in the
          order in which they appear
    key_property: property in the geojson 'properties' section distinguishing each area
    key_column: name of the dataframe column holding the area keys
    value_column: name of the dataframe column holding the values used for coloring
    template: hover-box text with '{}' placeholders, filled from the remaining
        dataframe columns in order
    mapbox_access_token: token needed for using mapbox
    options: dict of styling options ('colorscale', 'colorbar', 'showscale')
    '''
    # styling options with defaults
    colorscale = options.get('colorscale', 'Viridis')
    colorbar = options.get('colorbar', {})
    showscale = options.get('showscale', True)
    # geojson with per-feature ids for the locations join
    geodata = load_geodata(geodata_path, key_property)
    # every column besides key and value becomes a hover-template variable
    excluded = {key_column, value_column}
    variable_names = [column for column in dataframe.columns.values if column not in excluded]
    extended_geodata = insert_variables_as_properties_in_geodata(geodata, dataframe, key_property, key_column, variable_names)
    hover_template = format_template(template, variable_names) + '<extra></extra>'
    choropleth = go.Choroplethmapbox(
        geojson=extended_geodata,
        locations=dataframe[key_column],
        z=dataframe[value_column],
        hovertemplate=hover_template,
        showscale=showscale,
        colorscale=colorscale,
        colorbar=colorbar
    )
    figure = go.Figure(choropleth)
    figure.update_layout(
        title=title,
        font={'family': 'Arial Black'},
        mapbox_accesstoken=mapbox_access_token,
        mapbox_zoom=9,
        mapbox_pitch=0,
        mapbox_bearing=0,
        mapbox_center={'lat': 41.86, 'lon': -87.63},  # downtown Chicago
        margin={'r': 0, 'l': 50, 't': 50, 'b': 0}
    )
    return figure
class MapLayer():
    """One selectable layer for plot_maps: a dataframe plus the column names
    and styling options used to render it."""

    def __init__(self, name, dataframe, key_column, value_column, options):
        self.name = name                  # label shown in the dropdown menu
        self.dataframe = dataframe        # data backing this layer
        self.key_column = key_column      # column joining rows to geojson features
        self.value_column = value_column  # column plotted as the area color
        self.options = options            # per-layer styling options
# Implementation based on: https://plot.ly/~empet/15237/choroplethmapbox-with-dropdown-menu/#/
def plot_maps(title, geodata_path, key_property, layers, template, mapbox_access_token, options):
    """Draw several choropleth layers on one map with a dropdown to switch between them.

    title: plot title
    geodata_path: path to the geojson file with area boundaries
    key_property: property in the geojson 'properties' section distinguishing each area
    layers: list of MapLayer objects, one per dropdown entry; the first one is shown initially
    template: hover-box text with '{}' placeholders, filled from the remaining
        dataframe columns of each layer, in column order
    mapbox_access_token: token needed for using mapbox
    options: shared styling options, used when a layer does not override them
    """
    geodata = load_geodata(geodata_path, key_property)
    data = []
    for layer in layers:
        dataframe = layer.dataframe
        key_column = layer.key_column
        value_column = layer.value_column
        # Per-layer options take precedence, falling back to the shared
        # `options` dict and then to hard defaults.
        # (Bug fix: the original tested membership in `options` while reading
        # from `layer.options`, which raised KeyError whenever the two dicts
        # disagreed and silently ignored per-layer settings otherwise.)
        colorscale = layer.options.get('colorscale', options.get('colorscale', 'Viridis'))
        colorbar = layer.options.get('colorbar', options.get('colorbar', {}))
        showscale = layer.options.get('showscale', options.get('showscale', True))
        # every column besides key and value becomes a hover-template variable
        dropped_columns = {key_column, value_column}
        variable_names = [column for column in dataframe.columns.values if column not in dropped_columns]
        extended_geodata = insert_variables_as_properties_in_geodata(geodata, dataframe, key_property, key_column, variable_names)
        formatted_template = format_template(template, variable_names) + '<extra></extra>'
        data.append(go.Choroplethmapbox(
            geojson=extended_geodata,
            locations=dataframe[key_column],
            z=dataframe[value_column],
            hovertemplate=formatted_template,
            showscale=showscale,
            colorscale=colorscale,
            colorbar=colorbar,
            visible=False
        ))
    # only the first layer is visible until another is picked from the menu
    data[0]['visible'] = True
    layout = go.Layout(
        title=title,
        font={'family': 'Arial Black'},
        mapbox_accesstoken=mapbox_access_token,
        mapbox_zoom=9,
        mapbox_pitch=0,
        mapbox_bearing=0,
        mapbox_center={'lat': 41.86, 'lon': -87.63},  # downtown Chicago
        margin={'r': 0, 'l': 50, 't': 50, 'b': 0}
    )
    # one boolean visibility mask per layer, True only at that layer's index
    visibilities = []
    for i in range(len(layers)):
        visibility = [False] * len(layers)
        visibility[i] = True
        visibilities.append(visibility)
    # dropdown buttons restyling the traces' visibility
    buttons = [{
        'args': ['visible', visibilities[i]],
        'label': layer.name,
        'method': 'restyle'
    } for i, layer in enumerate(layers)]
    layout.update(
        updatemenus=[{
            'x': 0.95,
            'y': 0.95,
            'yanchor': 'top',
            'buttons': buttons
        }]
    )
    return go.Figure(data=data, layout=layout)
# Function for saving plots as html and loading them
from plotly.offline import plot
from IPython.display import IFrame
def plot_offline(fig, filename):
    """Save *fig* as a standalone HTML file and return an IFrame displaying it.

    fig: plotly figure to save
    filename: path of the HTML file to write (and to embed in the IFrame)

    Fix: the original first wrote a bare HTML div to *filename* and then
    immediately overwrote it with `pyo.plot`, so the first write was dead
    code; it is removed. `auto_open=False` keeps the browser from popping up
    on every save (matching the intent of the removed first call).
    """
    pyo.plot(fig, filename=filename, auto_open=False, validate=False)
    return IFrame(src=filename, width=1000, height=600)
We start by reading our data
# Read the raw Chicago food-inspections export into a dataframe
inspections = pd.read_csv(os.path.join(data_folder, "food-inspections.csv"))
We can now look into the features and types of our dataset through a sample:
# Quick look at the first rows and the available columns
inspections.head()
inspections.columns
First thing we notice is the strange presence of the columns Historical Wards 2003-2015, Zip Codes, Community Areas, Census Tracts and Wards. These columns all have only NaN values, so we are dropping them:
# These five columns contain only NaN values in this export, so drop them
inspections.drop(columns=['Historical Wards 2003-2015', 'Zip Codes',
'Community Areas', 'Census Tracts', 'Wards'],inplace=True)
Let's see what are the possible categories of our facilities:
# Distinct facility types present in the data
inspections["Facility Type"].unique()
We can notice very different facility types; as we want to analyze only restaurants we will keep only entries where "Restaurant" is the Facility Type. Let's see if we have any entries with missing Facility Type:
# Count rows with a missing Facility Type
print("There are {0} missing values for Facility Type in the dataset".format(inspections['Facility Type'].isna().sum()))
For now we will get rid of those rows as they are of no use to us:
# Drop rows whose Facility Type is missing
inspections = inspections.dropna(subset=["Facility Type"])
We only want to analyze entries which correspond to restaurants so we will keep only those rows
# Keep only restaurants for the rest of the analysis
# (case-insensitive substring match on Facility Type)
inspections = inspections[inspections["Facility Type"].str.contains("restaurant", case=False)]
print(f"We have {len(inspections)} rows in our dataset, each row corresponds to one inspection")
Since we are going to compare inspection's results over time, let's look at the time range we have for the inspections.
# Cast Inspection Date to Datetime type
inspections['Inspection Date'] = pd.to_datetime(inspections['Inspection Date'])
# Report the covered time range
print('The first inspection in the dataset happened in {0}, and the last one happened was on {1}'.
format(inspections['Inspection Date'].min().strftime("%d.%m.%Y"), inspections['Inspection Date'].max().strftime("%d.%m.%Y.")))
We will now proceed with preparing the dataset for the analysis. Below we take the following preprocessing steps:
The original dataset we have is inspections-oriented, but our analysis will be mainly focused on restaurants. That is why we need a feature which will help us uniquely identify each restaurant.
# Compare candidate restaurant identifiers: business name vs. license number
print(f"Using DBA Name we can see there are {inspections['DBA Name'].nunique()} distinct restaurant names")
print(f"Using restaurant Licenses we can see there are {inspections['License #'].nunique()} distinct restaurants")
We have probably many non unique names due to restaurants such as McDonalds and other large food-serving chains. We want to make the license number our primary key for restaurant identification as we assume different restaurants have different license numbers. First, let's check if there are any missing values in this column:
# Count rows with a NaN license number
print(f"There are {inspections['License #'].isna().sum()} restaurant that have NaN licence number")
We noticed, however, that there are some restaurants with license numbers equal to 0; let's see how many there are:
# Count license numbers recorded as 0 (effectively missing)
print(f"There are {len(inspections[inspections['License #'] == 0.0])} restaurant that have zero licence number")
As we can see, there is no column that can help us uniquely identify a restaurant. The closest match is License # feature, which has some restaurants with 0 assigned as their license number. We decided to add fake licence numbers and use it as a restaurant identifier later on.
# Get entries where License number is missing
zero_license = inspections[inspections["License #"] == 0.0]
# How many distinct values each column holds among those rows
zero_license.nunique()
To be able to identify restaurants with license number equal to 0, we will use the restaurant location and name. Using these two attributes, we can create artificial license numbers for the restaurants whose license numbers are missing:
# Get the maximum license number that exists in the database and use it as starting point for the newly generated license numbers
start_id = int(inspections["License #"].max() + 1)
# One synthetic license per distinct (name, location) pair among zero-license rows
missing_licenses = inspections[(inspections["License #"] == 0)][["DBA Name","Location"]].copy().drop_duplicates()
missing_licenses["new License"] = [i for i in range(start_id, start_id+len(missing_licenses.drop_duplicates()))]
# Merge dataset
inspections = inspections.merge(missing_licenses, on=["DBA Name","Location"], how='left')
# Populate missing licence numbers with newly generated license numbers
inspections["License #"] = inspections["License #"].apply(lambda x: np.nan if x == 0 else x)
inspections["License #"] = inspections["License #"].fillna(value = inspections["new License"])
inspections.drop(columns=["new License"], inplace=True)
# NOTE(review): .size counts cells, not rows — it prints 0 here only because no rows match;
# .shape[0] would be the actual row count
print("There are {0} missing license numbers in our dataset".format(inspections[inspections["License #"] == 0].size))
# Cast License number from float to int
inspections["License #"] = inspections["License #"].apply(lambda x : int(x))
We want to populate the missing Zip values based on the Latitude and Longitude. In order to do that, we must have those two features for all the restaurants in the dataset. That is why we decided to drop all entries where one of those features is missing.
# The Zip backfill below needs coordinates, so drop rows missing Longitude or Latitude
inspections = inspections[~((inspections['Longitude'].isna()) | (inspections['Latitude'].isna()))]
print('There are {0} missing values for Zip column'.format(inspections[inspections['Zip'].isna()].shape[0]))
print('There are {0} missing values for City column'.format(inspections[inspections['City'].isna()].shape[0]))
# Rows whose Zip we will reconstruct from coordinates
restaurants_zip_na = inspections[inspections['Zip'].isna()]
To populate Zip values based on coordinates, we use shapely - Python package for manipulation and analysis of planar geometric objects.
# Build one shapely Point (x=Longitude, y=Latitude) per row of the dataframe
def create_points(df):
    """Return a list of shapely Points constructed from the 'Longitude'
    and 'Latitude' columns of *df*, one per row, in row order."""
    return [shapely.geometry.Point(lon_lat)
            for lon_lat in zip(df['Longitude'], df['Latitude'])]
# Create list of points for which we want to get Zip code
# (one shapely Point per row still missing a Zip)
points = create_points(restaurants_zip_na)
# Method which checks whether the points are in an area described in the geojson file
# and returns data with the zip value for the points found
def populate_missing_zip(points, geojson_filename):
    """For every point contained in one of the geojson polygons, return a dict
    with its Longitude, Latitude and the polygon's 'zip' property.

    points: iterable of shapely Points (x=Longitude, y=Latitude)
    geojson_filename: path to a GeoJSON file of zip-code polygons

    Fix: the original used `json.load(open(...))`, leaking the file handle;
    a context manager closes it. The per-feature 'zip' lookup is hoisted out
    of the inner loop.
    """
    # load GeoJSON file containing sectors
    with open(geojson_filename) as geojson_file:
        geo_json_data = json.load(geojson_file)
    zip_found = []
    # check each polygon to see if it contains the point
    for feature in geo_json_data['features']:
        polygon = shapely.geometry.shape(feature['geometry'])
        zip_code = feature.get('properties', {}).get('zip')
        for point in points:
            if polygon.contains(point):
                zip_found.append({'Longitude': point.x,
                                  'Latitude': point.y,
                                  'Zip': zip_code})
    return zip_found
# Find missing Zip values
zip_found = populate_missing_zip(points, os.path.join(data_folder, 'chicago-zip.json'))
print('Total {0} point found matching Chicago sectors.'.format(len(zip_found)))
# Tabulate the matches so they can be merged back into the dataset
zip_found = pd.DataFrame(zip_found)
zip_found.head()
Now, we have to merge those results with the original dataset.
# Before merging, drop duplicate points
zip_found.drop_duplicates(inplace=True)
# Attach the recovered Zips, then use them to fill gaps in the original column
inspections = inspections.merge(zip_found,on=['Latitude','Longitude'], how='left',suffixes=('', '_notnull'))
inspections.Zip.fillna(value=inspections.Zip_notnull, inplace=True)
inspections.drop(columns=["Zip_notnull"], inplace=True)
print('There are {0} missing Zip left in the restaurant dataset.'.format(inspections[inspections.Zip.isnull()].shape[0]))
# Change type of Zip feature from float to string
# NOTE(review): str(int(x)) raises on any Zip still NaN — assumes the backfill left none; verify
inspections['Zip']=inspections['Zip'].apply(lambda x: str(int(x)))
We will also fix the City column. Now we can use the Zip column to fill in the missing information about the city.
# Function that returns all Chicago Zips from the geojson file
def create_chicago_zip_list():
    """Return the set of Zip codes (as strings) found in the 'zip' property
    of every feature in data/chicago-zip.json.

    Fix: the original used `json.load(open(...))`, leaking the file handle;
    a context manager closes it.
    """
    state_geo_path = os.path.join(data_folder, "chicago-zip.json")
    with open(state_geo_path) as geojson_file:
        geo_json_data = json.load(geojson_file)
    return {str(feature.get('properties', {}).get('zip'))
            for feature in geo_json_data['features']}
# Get list of all Zip codes in Chicago
chicago_zip = create_chicago_zip_list()
# Check if there is any restaurant not in Chicago
not_in_chicago = len(inspections[inspections.City.isna() & (~inspections.Zip.isin(chicago_zip))])
print('There are {0} Zip values which are not in Chicago.'.format(not_in_chicago))
# Replace all City missing values with Chicago
inspections.City.fillna(value='Chicago', inplace=True)
print('There are {0} missing City values left in the restaurant dataset.'.format(inspections[inspections.City.isnull()].shape[0]))
# Inspect the remaining spellings before normalizing
inspections.City.unique()
# Change all values for City column to be Chicago
inspections.City = 'Chicago'
inspections.City.unique()
# Raw inspection-type labels before cleaning
inspections['Inspection Type'].unique()
Looking at Inspection Type values we can see that they need some cleaning. According to the document describing the dataset we should have following inspection types:
Also, as the linked document states, re-inspections can be done for most of the types and are indicated in the name of inspection type.
First, we replace NaN values in Inspection Type with "Unknown":
# Replace NaN inspection types with an explicit 'Unknown' label
inspections.fillna(value={'Inspection Type': 'Unknown'}, inplace=True)
Now, since we may want to use the indication whether a particular inspection was a re-inspection or not, we add a separate column that will indicate that:
# Flag re-inspections: the type name contains some spelling of "re-inspection"
reinspection_pattern = 're-inspec|reinspec|re inspec'
inspections['Re-inspection'] = inspections['Inspection Type'].str.lower().str.contains(reinspection_pattern, regex=True)
Now, we proceed with making the names for given types of inspections uniform:
# Distinct type labels (plus a lowercased copy) used by the keyword matcher below
inspection_types = inspections['Inspection Type'].unique().astype(str)
inspection_types_lower = np.char.lower(inspection_types)
# Rewrites every 'Inspection Type' whose (lowercased) name contains one of the
# given keywords to the single canonical replacement value
def standardize_by_finding_keyword(keywords, replacement):
    """Replace all inspection types containing any of *keywords*
    (case-insensitive) with *replacement* in the global `inspections`.

    Relies on the module-level `inspection_types` / `inspection_types_lower`
    arrays computed above.
    """
    matched = np.array([])
    for keyword in keywords:
        hits = inspection_types[np.char.find(inspection_types_lower, keyword) != -1]
        matched = np.append(matched, hits)
    inspections['Inspection Type'] = inspections['Inspection Type'].replace(matched, value=replacement)
# Fold spelling variants into canonical categories
standardize_by_finding_keyword(['canvas'], 'Canvass')
standardize_by_finding_keyword(['complain'], 'Complaint')
standardize_by_finding_keyword(['license'], 'License')
standardize_by_finding_keyword(['task', 'liquor'], 'Task Force')
# Suspected Food Poisoning replacements
sfp_values = inspections['Inspection Type'].str.lower().str.contains('food|sfp', regex=True)
inspections.loc[sfp_values, 'Inspection Type'] = 'Suspected Food Poisoning'
standardize_by_finding_keyword(['sick'], 'Suspected Food Poisoning')
# Inspection counts per cleaned category
inspections.groupby(by='Inspection Type')['Inspection ID'].count()
There are still a lot of values that appear only once in the entire dataset. There are also ones that could be merged into single categories (e.g. "no entry", "out of business", "recent inspection"). We also decide to leave the categories with significant amount of records, such as "Consultation". The ones we decide to drop will be reclassified under "Unknown" category.
# Reassigns every 'Inspection Type' whose name contains *keyword* to *target_category*
def merge_categories(keyword, target_category):
    """Fold all inspection types containing *keyword* (matched against the
    lowercased name) into *target_category*, in the global `inspections`."""
    mask = inspections['Inspection Type'].str.lower().str.contains(keyword)
    inspections.loc[mask, 'Inspection Type'] = target_category
# Merge remaining small variants into single categories
merge_categories('recent inspection', 'Recent Inspection')
merge_categories('out of business', 'Out of Business')
merge_categories('no entry', 'No Entry')
# Categories we keep; everything else becomes 'Unknown'
known_list = ['License', 'Canvass', 'Complaint', 'Consultation', 'No Entry', 'Out of Business', 'Recent Inspection', 'Suspected Food Poisoning', 'Tag Removal', 'Task Force']
# Classify the rest as unknown
inspections.loc[~inspections['Inspection Type'].isin(known_list), 'Inspection Type'] = 'Unknown'
inspections.groupby(by='Inspection Type')['Inspection ID'].count()
Additionally, let's propagate "Out of Business" indication to "Results" column where it should be indicated:
# Mirror the 'Out of Business' inspection type into the Results column
out_of_business = inspections[inspections['Inspection Type'] == 'Out of Business'].index
inspections.loc[out_of_business, 'Results'] = 'Out of Business'
For the research questions we want to explore, we needed to create following additional columns in the dataset:
Chicago is divided into 77 Community Areas that can also be grouped into 9 Districts. Even though we have Zip codes for each restaurant, the role of Zip codes in real world is mainly related to post office and delivery services. People who live in Chicago and visit Chicago are, however, using Community Areas and districts for orientation.
We decided to enrich the dataset with this information. We are using the chicago-community-areas geojson file for getting boundaries of community areas, and shapely library to check for each coordinate in which community area it belongs (same approach that we used for populating missing Zip codes).
# Extract unique Latitude and Longitude from inspections
# (one containment test per distinct coordinate instead of per row)
unique_coords = inspections.copy()[['Latitude','Longitude']].drop_duplicates()
# Create shapely points
points = create_points(unique_coords)
# Method which checks whether the points are in an area described in the geojson file
# and returns data with the community area name for the points found
def populate_area(points, geojson_filename):
    """For every point contained in one of the geojson polygons, return a dict
    with its Longitude, Latitude and the polygon's 'community' property.

    points: iterable of shapely Points (x=Longitude, y=Latitude)
    geojson_filename: path to a GeoJSON file of community-area polygons

    Fix: the original used `json.load(open(...))`, leaking the file handle;
    a context manager closes it. The per-feature 'community' lookup is
    hoisted out of the inner loop.
    """
    # load GeoJSON file containing sectors
    with open(geojson_filename) as geojson_file:
        geo_json_data = json.load(geojson_file)
    area_found = []
    # check each polygon to see if it contains the point
    for feature in geo_json_data['features']:
        polygon = shapely.geometry.shape(feature['geometry'])
        community = feature.get('properties', {}).get('community')
        for point in points:
            if polygon.contains(point):
                area_found.append({'Longitude': point.x,
                                   'Latitude': point.y,
                                   'Community': community})
    return area_found
# Match every unique coordinate to its community area and merge it back
area_found = populate_area(points, os.path.join(data_folder, 'chicago-community-boundaries.json'))
print('Total {0} point found matching Chicago sectors.'.format(len(area_found)))
area_found = pd.DataFrame(area_found)
inspections = inspections.merge(area_found,on=['Latitude','Longitude'], how='left')
print('There are {0} missing Community left in the restaurant dataset.'.format(inspections[inspections.Community.isnull()].shape[0]))
Since we have some missing values after populating the Community values from geojson, we will try to fix them manually. Let's see if there is some pattern in the missing data.
# Which Zip codes the unmatched rows belong to
inspections[inspections.Community.isna()].Zip.unique()
It seems that the only missing data are from two Zip codes. By doing quick google search and looking at maps, we can see that 60666 refers to O'Hare International Airport and 60611 is Near North Side.
# Populate missing values
# (manual fixes: Zip 60666 is O'Hare airport, 60611 is Near North Side)
inspections.loc[((inspections.Community.isna()) & (inspections.Zip == '60666')), 'Community'] = "O'HARE INTERNATIONAL AIRPORT"
inspections.loc[((inspections.Community.isna()) & (inspections.Zip == '60611')), 'Community'] = "NEAR NORTH SIDE"
print('There are {0} missing Community left in the restaurant dataset.'.format(inspections[inspections.Community.isna()].shape[0]))
inspections.Community.unique()
We are also going to get the information about Districts for each restaurant. For that we created a mapping between Communities and Districts (source), which we load from an Excel file.
# Load the Community -> District mapping from the Excel file
chicago_communities = pd.read_excel(os.path.join(data_folder, 'Chicago-Communities.xlsx'))
chicago_communities.head()
# Normalize community names to upper case for the join
chicago_communities.Community = chicago_communities.Community.apply(lambda x: str.upper(x).strip())
inspections =inspections.merge(chicago_communities, on='Community', how='left')
print('There are {0} missing District left in the restaurant dataset.'.format(inspections[inspections.District.isna()].shape[0]))
Since our main motivation to get Community Areas and District for the inspections and restaurants is to be able to create map for visualizing results, we also needed the geojson file with district boundaries. Since we couldn't find it online, we created it on our own using the chicago-community-areas geojson file and by making unions of ares that belong to the same district. The source code for that can be found in notebooks\creating_district_boundaries.ipynb.
In our dataset violations detected during the inspections are noted in the Violations column. Each violation contains a code, general violation description (name/category of the violation) and comments added by the inspector. Since the violation codes uniquely identify a particular violation, we extract them and use them for our analysis.
print('There are {0} missing values in the Violations column'.format(inspections['Violations'].isna().sum()))
# Empty string instead of NaN so the code extraction below can split safely
inspections.fillna(value={'Violations': ''}, inplace=True)
# Parse the numeric codes out of a '|'-separated Violations string
def extract_violation_codes(violation):
    """Return the distinct numeric codes found in a raw Violations string.

    Each '|'-separated entry starts with '<code>. <description> ...';
    entries without a '.' are skipped. Duplicates are collapsed via a set so
    repeated categories don't skew the per-inspection distributions.
    """
    codes = set()
    for entry in violation.split('|'):
        entry = entry.strip()
        dot_index = entry.find('.')
        if dot_index != -1:
            codes.add(int(entry[:dot_index]))
    return list(codes)
# Add additional column with extracted codes
# (a list of distinct ints per inspection)
inspections['Violation Codes'] = inspections['Violations'].apply(extract_violation_codes)
What we also want to do in this part is categorize these violation reasons into bigger, more general groups. Categorizing violations into a smaller number of groups would help us better understand the high-level reasons for failing the inspection, and would also help in creating visualizations that are easier to understand. That is why we manually grouped all violation codes into 5 categories (using the violation descriptions for understanding what each violation code means):
What is really important for us is that, as indicated in the linked document from the dataset page, the violation codes and descriptions have changed as of 1/07/2018. This means, that violation codes before and after this date don't correspond to the same violation reason. We first tried to map the changed values to the old ones, but it did not give results we expected (there are many new violations added and also the meaning of many violation changed). Therefore we'll usually need to split our analysis into two parts, so that violation codes in each part are consistent - analyze the inspections before 1/07/2018 and after this day. This is where general categorization will be useful because it will allow us to connect those two periods because we keep same 5 categories for all violations.
# Split the dataset regarding the change of violation definitions
violation_change_date = pd.Timestamp(year=2018, month=7, day=1)
inspections['Inspection Date'] = pd.to_datetime(inspections['Inspection Date'])
inspections_before_change = inspections[inspections['Inspection Date'] < violation_change_date]
inspections_after_change = inspections[inspections['Inspection Date'] >= violation_change_date]
# Custom grouping in categories for violations before change
# (list order defines the general categories: 1=food, 2=facility, 3=sanitary, 4=staff, 5=unknown)
food_codes_before = [1, 3, 15, 16, 17, 30]
facility_codes_before = [2, 7, 9, 11, 13, 18, 22, 23, 24, 25, 26, 29, 32, 35, 36, 37, 38, 40, 42, 43]
sanitary_codes_before = [4, 8, 10, 12, 19, 20, 27, 31, 33, 34, 39, 41]
staff_codes_before = [5, 6, 21, 44, 45]
unknown_codes_before = [14, 28, 70]
codes_before = [food_codes_before, facility_codes_before, sanitary_codes_before, staff_codes_before, unknown_codes_before]
# Custom grouping in categories for violations after change (same category order)
food_codes_after = [11, 12, 13, 14, 15, 17, 23, 26, 27, 28, 30, 31, 37, 39, 42]
facility_codes_after = [10, 18, 19, 20, 21, 22, 33, 35, 36, 38, 41, 43, 44, 48, 50, 51, 53, 55, 56, 59, 60, 62]
sanitary_codes_after = [2, 8, 16, 40, 45, 46, 47, 49, 52, 54]
staff_codes_after = [1, 3, 7, 9, 25, 57, 58]
unknown_codes_after = [4, 5, 6, 24, 29, 32, 61, 63]
codes_after = [food_codes_after, facility_codes_after, sanitary_codes_after, staff_codes_after, unknown_codes_after]
# Maps each original violation code to its 1-based general category number
def create_mapping(codes):
    """Given a list of code groups, return {original_code: category_number},
    where category numbers start at 1 and follow the group order."""
    return {old_code: category_number
            for category_number, group in enumerate(codes, start=1)
            for old_code in group}
# Translate a list of original violation codes into our generalized categories
def encode_violations(violations, mapping):
    """Return the list obtained by looking every code in *violations* up in
    *mapping* (raises KeyError on unknown codes, as before)."""
    return [mapping[code] for code in violations]
# Create mappings for conversion
before_mapping = create_mapping(codes_before)
after_mapping = create_mapping(codes_after)
# Apply the era-appropriate mapping to each half of the dataset
mapped_inspections_before_change = inspections_before_change['Violation Codes'].apply(encode_violations, mapping=before_mapping)
mapped_inspections_after_change = inspections_after_change['Violation Codes'].apply(encode_violations, mapping=after_mapping)
# NOTE(review): Series.append was removed in pandas 2.0 — pd.concat is the modern equivalent
mapped_inspections = mapped_inspections_before_change.append(mapped_inspections_after_change)
# Add additional column with violation codes by our general categories
inspections = inspections.merge(mapped_inspections, left_index=True, right_index=True, suffixes=('', ' Generalized'))
inspections.head()
Not all violations have the same weight, right? For instance, it is not the same to have a dirty toilet sink as to serve expired food. Therefore, we want to make a distinction between violations, not only based on the category, but also based on how serious the violation was.
In order to differentiate restaurants and neighborhoods based on the severity of the violations they make, we need to know whether in some inspection there were some critical violations broken. For this, our analysis again differs before and after 1.7.2018.
In the official description of the dataset, which applies to violations before 1st of July 2018, it is stated that violations with code 1-14 are classified as critical violations, 15-29 are serious violations, and the rest are minor violations.
After the change of the violations in 2018, different renaming of critical, serious and minor violation was applied and they are called priority, priority foundation and core violation respectively. Moreover, there was no more clear separation which violation code belongs to which group. The way to deduct the severity of the violation in each inspection is to look at the comments made by inspectors. If there was a critical violation broken, it would be marked as priority violation.
We will use this information to check if there was any critical or serious violation for each inspection. We will create 2 new columns which will contain boolean values: Critical Violation Noticed and Serious Violation Noticed.
# Inspections before 1.7.2018: codes 1-14 are classified as critical violations
def is_critical_violation_before_change(violation_codes):
    """Return True when any of *violation_codes* falls in the critical
    range 1-14 (the pre-2018 classification)."""
    critical_range = set(range(1, 15))
    return any(code in critical_range for code in violation_codes)
# Inspections after 1.7.2018: critical violations are flagged as
# "priority violation" in the inspector's comments
def is_critical_violation_after_change(comment):
    """Return True when the comment text mentions a priority violation
    (case-insensitive)."""
    lowered = comment.lower()
    return 'priority violation' in lowered
# True when the inspection recorded at least one critical violation,
# using the rule matching the inspection's era
def check_critical_violation(row):
    """Dispatch on 'Inspection Date': before 1.7.2018 check the extracted
    'Violation Codes'; from that date on, scan the raw 'Violations' text."""
    change_date = datetime.datetime(2018, 7, 1)
    if row['Inspection Date'] < change_date:
        return is_critical_violation_before_change(row['Violation Codes'])
    return is_critical_violation_after_change(row['Violations'])
# Create new column which shows if during the inspection it was observed at least one critical violation
# (row-wise apply over the whole dataset — slow but simple)
inspections['Critical Violation Noticed'] = inspections.apply(lambda x : check_critical_violation(x), axis = 1)
Similarly, we will check serious violations. They are slightly less dangerous than critical ones, but still considered harmful to human health.
# Inspections before 1.7.2018: codes 15-29 are classified as serious violations
def is_serious_violation_before_change(violation_codes):
    """Return True when any of *violation_codes* falls in the serious
    range 15-29 (the pre-2018 classification)."""
    serious_range = set(range(15, 30))
    return any(code in serious_range for code in violation_codes)
# Inspections after 1.7.2018: serious violations are flagged as
# "priority foundation" in the inspector's comments
def is_serious_violation_after_change(comment):
    """Return True when the comment text mentions a priority foundation
    violation (case-insensitive)."""
    lowered = comment.lower()
    return 'priority foundation' in lowered
# True when the inspection recorded at least one serious violation,
# using the rule matching the inspection's era
def check_serious_violation(row):
    """Dispatch on 'Inspection Date': before 1.7.2018 check the extracted
    'Violation Codes'; from that date on, scan the raw 'Violations' text."""
    change_date = datetime.datetime(2018, 7, 1)
    if row['Inspection Date'] < change_date:
        return is_serious_violation_before_change(row['Violation Codes'])
    return is_serious_violation_after_change(row['Violations'])
# Create new column which shows if during the inspection it was observed at least one serious violation
# (row-wise apply over the whole dataset — slow but simple)
inspections['Serious Violation Noticed'] = inspections.apply(lambda x: check_serious_violation(x), axis=1)
inspections.head()
In this part we prepare some basic plots to get more visual insight in our dataset.
We start with number of inspections per year to see if we can observe any trend in that matter
# Count inspections per calendar year and plot as a bar chart
data = inspections.groupby(inspections["Inspection Date"].dt.year)["Inspection ID"].count().to_frame().reset_index()
data = data.rename(columns={"Inspection ID": "Number of inspections"})
colorscale = ['#FDCB36']
fig = px.bar(data, x='Inspection Date', y='Number of inspections',barmode='group', template='plotly_white', color_discrete_sequence=colorscale)
fig.show()
#plot_offline(fig,"inspections_per_year.html")
Next we want to look into inspections results each year
# Peek at the yearly totals computed above
data.head()
# Re-aggregate: inspections per year broken down by inspection result
data = inspections.groupby([inspections["Inspection Date"].dt.year, inspections["Results"]])["Inspection ID"].count().to_frame().reset_index()
data = data.rename(columns={"Inspection ID": "Number of inspections"})
# One colour per distinct inspection result value
colorscale = ['#BE4986', '#FDCB36', '#ED7953', '#BE4987', '#7D356F', '#774AA8', '#1C3587']
fig = px.bar(data, x='Inspection Date', y='Number of inspections', color='Results', barmode='group', template='plotly_white', color_discrete_sequence=colorscale)
fig.show()
#plot_offline(fig,"ispections_per_year_per_type.html")
It seems that the Chicago sanitary authorities got stricter: as we can see, in 2018 and 2019 far fewer restaurants pass an inspection outright, in favour of passing with conditions.
Before we proceed, using the information about critical and serious violations made during inspections, let's check how often restaurants commit those violations.
# Share of inspections that recorded at least one critical violation (truncated to a whole percent)
critical_violations_percentage = int(100*inspections[inspections['Critical Violation Noticed']].shape[0]/ inspections.shape[0])
print('{0}% of inspections had at least one critical violation.'.format(critical_violations_percentage))
# Same share for serious violations
serious_violations_percentage = int(100*inspections[inspections['Serious Violation Noticed']].shape[0]/ inspections.shape[0])
print('{0}% of inspections had at least one serious violation.'.format(serious_violations_percentage))
# Plot proportion of critical and serious violations in inspections
fig = plt.figure()
plt.suptitle('Percentage of inspections having critical or serious violations');
startingRadius=1
# First donut plot - Plotting proportion of critical violations
ax1 = fig.add_subplot(221)
# Two wedges: inspections without / with a critical violation
critical_violation_donut = [100-critical_violations_percentage, critical_violations_percentage]
ax1.text(0.13, startingRadius + 0.17, "Critical violations: "+str(critical_violations_percentage)+"%",
         horizontalalignment='center', verticalalignment='center')
ax1.pie(critical_violation_donut, radius=startingRadius, startangle=90, colors=['#fce0de', '#ed5b51'],
        wedgeprops={"edgecolor": "white", 'linewidth': 1})
# White inner circle turns the pie into a donut
circle = plt.Circle(xy=(0, 0), radius=0.45, facecolor='white')
plt.gca().add_artist(circle)
# Second donut plot - Plotting proportion of serious violations
ax2 = fig.add_subplot(222)
ax2.text(0.13, startingRadius + 0.17, "Serious violations: "+str(serious_violations_percentage)+"%",
         horizontalalignment='center', verticalalignment='center')
serious_violation_donut = [100-serious_violations_percentage, serious_violations_percentage]
ax2.pie(serious_violation_donut, radius=startingRadius, startangle=90, colors=['#fce0de', '#f0776e'],
        wedgeprops={"edgecolor": "white", 'linewidth': 1})
# Inner circle for the second donut as well
circle = plt.Circle(xy=(0, 0), radius=0.45, facecolor='white')
plt.gca().add_artist(circle)
plt.show()
Every one out of four inspections found at least one serious violation. Considering the fact that both critical and serious violations are considered harmful to human health, these percentages are really not negligible!
Below we present data analysis for each of the research questions we defined. These analysis leads to initial answers to these questions, we also describe one question we decided to drop (see Question 5). At the end of data analysis for each research question, we briefly describe what will/can be done with regards to the posed question.
What are the most common reasons for a restaurant not passing an inspection?
In this first question we want to explore the most common violations in the inspections that ended in failure. We will use Violation Codes and categories that we previously defined, and then we will do an analysis of most common violations for different communities. As a reminder, because of violation's changes, we will have different analysis for before and after the change.
# Focus only on failed inspections, separately for the pre- and post-change periods
failed_inspections_before = inspections_before_change[inspections_before_change['Results'] == 'Fail']
failed_inspections_after = inspections_after_change[inspections_after_change['Results'] == 'Fail']
Apart from grouping violations into 5 categories, we also named all of the violations, both before and after the change. This is going to help us better visualize the violations.
# Load the human-readable violation names valid before the 1.7.2018 change
violations_before_change = pd.read_excel(os.path.join(data_folder,'violation_names_before_2018.xlsx'))
violations_before_change.head()
Since we want to focus on the most common reasons for failing an inspection, we want to count how many times each violation occurred in inspections that ended in failure.
# Create list of all violation codes found during failed inspections
all_violations_before_change_list = failed_inspections_before['Violation Codes'].values.tolist()
# Flatten the per-inspection code lists into one flat list of codes
all_violations_before_change_list = [item for sublist in all_violations_before_change_list for item in sublist]
# Count occurrences of each violation
from collections import Counter
counted_violations_before_change = Counter(all_violations_before_change_list)
counted_violations_before_change = pd.DataFrame.from_dict(counted_violations_before_change, orient='index').reset_index()
# Join the counts onto the names table; the raw count column (named 0) becomes '# violated'
violations_before_change = violations_before_change.merge(counted_violations_before_change, left_on = 'Violation code', right_on ='index').sort_values(by=0,ascending = True).rename(columns={0:'# violated'}).drop(columns='index')
violations_before_change.head()
Apart from getting the number of violations, we will enrich this data frame with three additional columns: Violation Category, Critical Violation and Serious Violation. Violation Category will be one of the previously mentioned 5 categories that we created. Critical Violation and Serious Violation are boolean valued columns saying whether a violation is critical or serious, respectively. As we explained, violation codes 1-14 are critical violations, and 15-29 are serious violations. Note that we won't be able to create those two columns for the violations after the change.
# Function that returns violation category for violations before 1.7.2018.
def get_violation_category_before_change(violationCode):
    """Map a pre-1.7.2018 violation code to one of our five custom categories.

    Returns None when the code does not belong to any known category.
    """
    # Category -> code list mapping; checked in declaration order.
    categories_by_codes = {
        'Food related violations': [1, 3, 15, 16, 17, 30],
        'Facility related violations': [2, 7, 9, 11, 13, 18, 22, 23, 24, 25, 26, 29, 32, 35, 36, 37, 38, 40, 42, 43],
        'Sanitary related violations': [4, 8, 10, 12, 19, 20, 27, 31, 33, 34, 39, 41],
        'Staff related violations': [5, 6, 21, 44, 45],
        'Other': [14, 28, 70],
    }
    for category, codes in categories_by_codes.items():
        if violationCode in codes:
            return category
    return None
# Create columns - Violation Category, Critical Violation & Serious Violation
violations_before_change['Violation Category'] = violations_before_change['Violation code'].apply(get_violation_category_before_change)
# The is_*_violation_before_change helpers expect a list of codes, so each single code is wrapped
violations_before_change['Critical Violation'] = violations_before_change['Violation code'].apply(lambda x: is_critical_violation_before_change([x]))
violations_before_change['Serious Violation'] = violations_before_change['Violation code'].apply(lambda x: is_serious_violation_before_change([x]))
# Most frequent violations first
violations_before_change = violations_before_change.sort_values('# violated', ascending= False)
violations_before_change.head()
Now that we prepared the data for the analysis, we first want to see which violations are the ones that are present in the biggest number of failed inspections. We will plot all the violations names and the corresponding number of violations made, as well as the category to which that violation belongs to.
# Visualising only 25 most frequent violations
top_violations_before_change = violations_before_change.iloc[:25, :].copy()
categories = ['Food related violations', 'Facility related violations','Sanitary related violations','Staff related violations','Other']
fig = go.Figure()
# One Plasma shade per custom violation category
colorscale = [px.colors.sequential.Plasma[0],
              px.colors.sequential.Plasma[2],
              px.colors.sequential.Plasma[5],
              px.colors.sequential.Plasma[7],
              px.colors.sequential.Plasma[9]
              ]
# One horizontal-bar trace per category so the legend and colours group by category
for (category,color) in zip(categories,colorscale):
    fig.add_trace(go.Bar(x=top_violations_before_change[top_violations_before_change['Violation Category']==category]['# violated'],
                         y=top_violations_before_change[top_violations_before_change['Violation Category']==category]['Violation name'],
                         name=category,
                         orientation='h',
                         hovertemplate='%{y}<br>Number of violations: %{x}',
                         # choose color palette
                         marker_color=color
                         ))
fig.update_layout(
    title={
        'text': "Number of violations for failed inspection result",
        #'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_tickfont_size=14,
    yaxis=dict(
        title='Violation name',
        titlefont_size=20,
        tickfont_size=12,
        dtick= 1,
        categoryorder="total ascending"
    ),
    xaxis_title="Number of violations",
    bargap=0.4,
    legend=dict(
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    template="plotly_white"
    #height = 1200,
    #width = 1200
)
#plot_offline(fig,"violations_failed_before.html")
We have a winner! Among the failed inspections, dirty floor was mentioned the most as one of the violations. Two out of the top three violations are related to cleanliness in the restaurant. Interestingly, many restaurants have problems with wall and ceiling constructions. Due to the fact that we created the categories ourselves and that we assign each violation to strictly one category, it could be that some part of the ceiling and wall problems is also due to dirtiness apart from construction.
It seems that the "most popular violations" are sanitary and facility related. What is surprising is that, even though we are analysing restaurants, there is only one food related violation among the top 10 and it is in 10th place.
Looking at the top violations in failing, the inadequate floor or ceiling maintenance does not seem so severe. Let's now focus only on critical violations and see which critical violation is the most frequent.
# Visualize critical violations
critical_before_change = violations_before_change[violations_before_change['Critical Violation']]
fig = go.Figure()
# One horizontal-bar trace per category, coloured as in the previous chart
for (category,color) in zip(categories,colorscale):
    fig.add_trace(go.Bar(x=critical_before_change[critical_before_change['Violation Category']==category]['# violated'],
                         y=critical_before_change[critical_before_change['Violation Category']==category]['Violation name'],
                         name=category,
                         orientation='h',
                         hovertemplate='%{y}<br>Number of violations: %{x}',
                         # choose color palette
                         marker_color=color
                         ))
fig.update_layout(
    title="Number of critical violations for failed inspection result",
    xaxis_title="Number of violations",
    yaxis_title="Critical violation name",
    yaxis_dtick= 1,
    # bargap=0.1,
    yaxis_categoryorder = "total ascending",
    template="plotly_white"
)
#plot_offline(fig,"critical_violations_before.html")
When looking at critical violations only, the top violation is related to proper food storage temperature. According to the full violation description, it means that potentially hazardous food meets temperature requirements during storage, preparation and service. In second and third place we have facility related violations, and then come the ones related to adequate cleaning of all necessary areas and equipment.
# Visualize serious violations
serious_before_change = violations_before_change[violations_before_change['Serious Violation']]
fig = go.Figure()
# One horizontal-bar trace per category, coloured as in the previous charts
for (category,color) in zip(categories,colorscale):
    fig.add_trace(go.Bar(x=serious_before_change[serious_before_change['Violation Category']==category]['# violated'],
                         y=serious_before_change[serious_before_change['Violation Category']==category]['Violation name'],
                         name=category,
                         orientation='h',
                         hovertemplate='%{y}<br>Number of violations: %{x}',
                         # choose color palette
                         marker_color=color
                         ))
fig.update_layout(
    title="Number of serious violations for failed inspection result",
    xaxis_title="Number of violations",
    yaxis_title="Serious violation name",
    yaxis_dtick= 1,
    bargap=0.2,
    yaxis_categoryorder = "total ascending",
    template="plotly_white"
)
#plot_offline(fig,"serious_violations_before.html")
Among the serious violations, the big problem for restaurants is that they do not have adequate protection from entrance of rodents and insects in the building. Also, many restaurants do not manage to correct serious violations they made, which also brings up the possibility to think about the performance of restaurants on re-inspections, which we will discuss later.
Now, we will do the same analysis for the violations after the change on 1.7.2018.
# Load the human-readable violation names valid after the 1.7.2018 change
violations_after_change = pd.read_excel(os.path.join(data_folder,'violation_names_after_2018.xlsx'))
violations_after_change.head()
# Create list of all violation codes found during failed inspections
all_violations_after_change_list = failed_inspections_after['Violation Codes'].values.tolist()
# Flatten the per-inspection code lists into one flat list of codes
all_violations_after_change_list = [item for sublist in all_violations_after_change_list for item in sublist]
# Count occurrences of each violation
counted_violations_after_change = Counter(all_violations_after_change_list)
counted_violations_after_change = pd.DataFrame.from_dict(counted_violations_after_change, orient='index').reset_index()
# Join counts onto the names table; the raw count column (named 0) becomes '# violated'
violations_after_change = violations_after_change.merge(counted_violations_after_change, left_on = 'Violation code', right_on ='index').sort_values(by=0,ascending = False).rename(columns={0:'# violated'}).drop(columns='index')
violations_after_change.head()
# Function that returns violation category for violations after 1.7.2018.
# (the original comment said "before"; these mappings are for the post-change code book)
def get_violation_category_after_change(violationCode):
    """Map a post-1.7.2018 violation code to one of our five custom categories.

    Returns None when the code does not belong to any known category.
    """
    # Definiton of code mappings
    food_codes_after = [11, 12, 13, 14, 15, 17, 23, 26, 27, 28, 30, 31, 37, 39, 42]
    facility_codes_after = [10, 18, 19, 20, 21, 22, 33, 35, 36, 38, 41, 43, 44, 48, 50, 51, 53, 55, 56, 59, 60, 62]
    sanitary_codes_after = [2, 8, 16, 40, 45, 46, 47, 49, 52, 54]
    staff_codes_after = [1, 3, 7, 9, 25, 57, 58]
    unknown_codes_after = [4, 5, 6, 24, 29, 32, 61, 63]
    if violationCode in food_codes_after:
        return 'Food related violations'
    if violationCode in facility_codes_after:
        return 'Facility related violations'
    if violationCode in sanitary_codes_after:
        return 'Sanitary related violations'
    if violationCode in staff_codes_after:
        return 'Staff related violations'
    if violationCode in unknown_codes_after:
        return 'Other'
    return None
# Create column - Violation Category (critical/serious flags cannot be derived from post-change codes)
violations_after_change['Violation Category'] = violations_after_change['Violation code'].apply(get_violation_category_after_change)
violations_after_change.head()
# Visualising only 25 most frequent violations
top_violations_after_change = violations_after_change.iloc[0:25,:].copy()
categories = ['Food related violations', 'Facility related violations','Sanitary related violations','Staff related violations','Other']
fig = go.Figure()
# One Plasma shade per custom violation category
colorscale = [px.colors.sequential.Plasma[0],
              px.colors.sequential.Plasma[2],
              px.colors.sequential.Plasma[5],
              px.colors.sequential.Plasma[7],
              px.colors.sequential.Plasma[9]
              ]
# One horizontal-bar trace per category so colours and legend group by category
for (category,color) in zip(categories, colorscale):
    fig.add_trace(go.Bar(x=top_violations_after_change[top_violations_after_change['Violation Category']==category]['# violated'],
                         y=top_violations_after_change[top_violations_after_change['Violation Category']==category]['Violation name'],
                         name=category,
                         orientation='h',
                         hovertemplate='%{y}<br>Number of violations: %{x}',
                         # marker=dict(
                         #     color=px.colors.sequential.Plasma),
                         # marker={'colorscale': 'Plasma'}
                         # choose color palette
                         marker_color=color
                         )
                  )
fig.update_layout(
    template='plotly_white',
    title={
        'text': "Number of violations for failed inspection result",
        #'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_tickfont_size=14,
    yaxis=dict(
        title='Violation name',
        titlefont_size=16,
        tickfont_size=12,
        dtick= 1,
        categoryorder="total ascending"
    ),
    xaxis_title="Number of violations",
    bargap=0.4
)
#plot_offline(fig,"violations_after_change.html")
We already mentioned the big differences and stricter criteria after the change date. That can also be noticed in the violations that are made the most. Now for the first time we can see a big impact of violations related to employee training. Apparently, many restaurants did not have time to adjust properly to the new regulations and failed in the past year and a half due to inadequate management or lack of allergen training for the staff. Also, the City of Chicago requires defined procedures for many situations that can happen in a restaurant, and the one that is the most problematic for restaurants to comply with is reacting in case of customer sickness. So be careful and remind yourself of your first aid knowledge, because the restaurant employees apparently cannot help you properly. Of course, when looking at these reasons, we have to take into account that this changed violation list has been valid for less than 18 months, so there is a smaller amount of inspections which can be analyzed.
Below we present the visualizations of the most common violations, both for our generalized categories and original violation codes before and after the change of violations. Following functions are used for aggregations and extracting most common violations per community.
# Get violation codes list for each neighbourhood (zip code)
def violation_codes_by(data, by_column, violation_column):
    """Group `data` by `by_column` and merge each group's violation-code lists into one flat list."""
    grouped_codes = data.groupby(by=by_column)[violation_column]
    return grouped_codes.apply(merge_violation_codes)
# Find top most common violations in given violations list
def most_common_violations(violations, max_violation_code, top=1):
    """Return the `top` most frequent violation codes, ordered by descending frequency."""
    counts, _ = violation_counts(violations, max_violation_code)
    ranked_desc = np.argsort(counts)[::-1]
    # histogram indices are zero-based while violation codes start at 1
    return ranked_desc[:top] + 1
# Find top most common violations in each neighbourhood (zip code)
def most_common_violations_by(data, by_column, violation_column, max_violation_code, top=1):
    """For every group in `by_column`, return that group's `top` most common violation codes."""
    codes_per_group = violation_codes_by(data, by_column, violation_column)
    return codes_per_group.apply(most_common_violations, max_violation_code=max_violation_code, top=top)
# Merges all violation codes from the column into one flat array
def merge_violation_codes(violation_series):
    """Flatten a Series of per-inspection code lists into one list of codes."""
    merged = []
    for inspection_codes in violation_series.values:
        merged.extend(inspection_codes)
    return merged
# Creates the histogram for violation codes
def violation_counts(violations, max_violation_code):
    """Histogram of violation codes over the integer bins 1..max_violation_code."""
    # one unit-wide bin per possible code; the extra edge closes the last bin
    bin_edges = np.arange(1, max_violation_code + 2)
    counts, code_bins = np.histogram(violations, bins=bin_edges)
    return counts, code_bins
# Creates violation codes distribution from the dataframe
def violations_distribution(df, violation_column='Violation Codes', max_violation_code=70):
    """Return (codes, counts): the histogram of every violation code found in `df[violation_column]`."""
    # https://stackoverflow.com/a/38258158
    flattened_codes = merge_violation_codes(df[violation_column])
    counts, edges = violation_counts(flattened_codes, max_violation_code)
    # drop the final right-hand edge so codes and counts align one-to-one
    return edges[:-1], counts
# For each community get the most common violation regarding original violations and created categories as well as the change in violations
by = 'Community'
# Top original violation code per community, before and after the code-book change
most_common_violations_by_community_before_change = most_common_violations_by(inspections_before_change, by, 'Violation Codes', 70, top=1)
most_common_violations_by_community_after_change = most_common_violations_by(inspections_after_change, by, 'Violation Codes', 63, top=1)
# Top generalized (5-category) violation per community over the whole period
most_common_generalized_violations_by_community = most_common_violations_by(inspections, by, 'Violation Codes Generalized', 5, top=1)
# Combine most common violations into one dataframe
most_common = pd.DataFrame(most_common_generalized_violations_by_community) \
    .rename(columns={'Violation Codes Generalized': 'general_most_common'}) \
    .merge(most_common_violations_by_community_before_change, left_index=True, right_index=True) \
    .rename(columns={'Violation Codes': 'before_most_common'}) \
    .merge(most_common_violations_by_community_after_change, left_index=True, right_index=True) \
    .rename(columns={'Violation Codes': 'after_most_common'}) \
    .reset_index()
# Transform most common general violation to value from list
most_common = most_common.explode('general_most_common')
most_common['community_name'] = most_common['Community'] # for plotting purposes
# Transform violation codes into names
def transform_violation_codes_into_names(violation_codes, names_mapping_df):
    """Return a comma-separated string of violation names for `violation_codes`.

    Codes absent from `names_mapping_df` are skipped (the original indexed
    `names[0]` unconditionally and raised IndexError on unknown codes).
    Returns the string 'None' when no names could be resolved.
    """
    resolved_names = []
    for code in violation_codes:
        matches = names_mapping_df[names_mapping_df['Violation code'] == code]['Violation name'].values
        if len(matches) > 0:
            resolved_names.append(matches[0])
    if len(resolved_names) > 0:
        return ', '.join(resolved_names)
    else:
        return 'None'
# Resolve the most-common code lists into readable names for the hover template
most_common['before_most_common_names'] = most_common['before_most_common'].apply(transform_violation_codes_into_names, names_mapping_df=violations_before_change)
most_common['after_most_common_names'] = most_common['after_most_common'].apply(transform_violation_codes_into_names, names_mapping_df=violations_after_change)
# Create separate dataframes with limited columns for plotting on maps
most_common_before = most_common.copy().drop(columns=['general_most_common', 'after_most_common', 'after_most_common_names']).explode('before_most_common')
most_common_after = most_common.copy().drop(columns=['general_most_common', 'before_most_common', 'before_most_common_names']).explode('after_most_common')
most_common.drop(columns=['before_most_common', 'after_most_common'], inplace=True)
# Reordering columns for plotting purposes (variables interpolation in template)
most_common = most_common[['Community', 'general_most_common', 'community_name', 'before_most_common_names', 'after_most_common_names']]
# Hover template: community name, then the most common violation before/after the change
template = '<b>{}</b><br><b>Before 1/07/2018:</b> {}<br><b>After 1/07/2018:</b> {}'
figure = plot_map('Most common violations for Chicago communities',
                  geodata_path=os.path.join(data_folder, 'chicago-community-boundaries.json'),
                  dataframe=most_common,
                  key_property='community', key_column='Community', value_column='general_most_common',
                  template=template,
                  mapbox_access_token=MAPBOX_ACCESS_TOKEN,
                  options={}
                  )
figure.show()
#plot_offline(figure,"common_violations_community.html")
The above plot shows that for all of the communities (apart from only the one that is not included in our geojson file for which the data is not displayed) the most common violation belonged to the "Facility related" violations. Hovering over the cells one can also inspect the most common original violation for a particular community before and after the change of violations.
Below we do separate plots for the most common violations before and after the change of violations.
Most common violations per neighborhood visualization (before 1/07/2018)
# https://community.plot.ly/t/colors-for-discrete-ranges-in-heatmaps/7780
# Creates a colorscale with discrete steps for plotting given start and end rgb tuples and number of discrete values
def make_discrete_colorscale(start, end, n_categories):
    """Build a plotly-style discrete colorscale.

    start, end: (r, g, b) tuples for the first and last category colour.
    n_categories: number of discrete colour bands to interpolate between them.
    Returns a list of [fraction, 'rgb(r, g, b)'] pairs where each category
    occupies a flat band of the scale (two identical colour stops per band).
    """
    r0, g0, b0 = start
    r1, g1, b1 = end
    if n_categories <= 1:
        # The original divided by (n_categories - 1) and crashed for a single
        # category; fall back to a flat scale of the start colour instead.
        color = f'rgb({float(r0)}, {float(g0)}, {float(b0)})'
        return [[0.0, color], [1.0, color]]
    # Per-category step for each colour channel
    dr = (r1 - r0) / (n_categories - 1)
    dg = (g1 - g0) / (n_categories - 1)
    db = (b1 - b0) / (n_categories - 1)
    colors = []
    for cat in range(n_categories):
        # two identical stops per category produce a flat (discrete) band
        for i in range(2):
            colors.append([(cat + i) / n_categories, f'rgb({r0 + cat*dr}, {g0 + cat*dg}, {b0 + cat*db})'])
    return colors
# Map violation codes to ordered values starting from 0
def map_violations_to_discrete_ordered_values(dataframe, mapped_column):
    """Replace `mapped_column` values in place with 0-based indices, ordered by first appearance."""
    # https://stackoverflow.com/a/209854
    distinct_values = list(dataframe[mapped_column].unique())
    value_to_index = {value: index for index, value in enumerate(distinct_values)}
    dataframe[mapped_column] = dataframe[mapped_column].map(value_to_index)
# Map each community's top code to a 0-based colour index
map_violations_to_discrete_ordered_values(most_common_before, 'before_most_common')
# Plot the map
# NOTE(review): len(<Series>) counts rows, not distinct values — nunique() looks intended; confirm
n_categories = len(most_common_before['before_most_common'])
template = '<b>{}</b><br>{}'
figure = plot_map('Most common violations for Chicago communities before 1/07/2018',
                  geodata_path=os.path.join(data_folder, 'chicago-community-boundaries.json'),
                  dataframe=most_common_before,
                  key_property='community', key_column='Community', value_column='before_most_common',
                  template=template,
                  mapbox_access_token=MAPBOX_ACCESS_TOKEN,
                  options={
                      'colorscale': make_discrete_colorscale((200, 200, 255), (0, 0, 255), n_categories),
                      'showscale': False
                  })
figure.show()
#plot_offline(figure,"common_violations_community_before.html")
On the plot above one can see that most of the communities in Chicago have "Proper floor maintenance" as the most common violation. There is also a region of 7 communities in the south-west part of Chicago for which "Clear equipment" was the most predominant violation in inspections.
Most common violations per neighborhood visualization (after 1/07/2018)
# Map each community's top code to a 0-based colour index
map_violations_to_discrete_ordered_values(most_common_after, 'after_most_common')
# Plot the map
# NOTE(review): len(<Series>) counts rows, not distinct values — nunique() looks intended; confirm
n_categories = len(most_common_after['after_most_common'])
template = '<b>{}</b><br>{}'
figure = plot_map('Most common violations for Chicago communities after 1/07/2018',
                  geodata_path=os.path.join(data_folder, 'chicago-community-boundaries.json'),
                  dataframe=most_common_after,
                  key_property='community', key_column='Community', value_column='after_most_common',
                  template=template,
                  mapbox_access_token=MAPBOX_ACCESS_TOKEN,
                  options={
                      'colorscale': make_discrete_colorscale((200, 200, 255), (0, 0, 255), n_categories),
                      'showscale': False
                  })
figure.show()
#plot_offline(figure,"common_violations_community_after.html")
After the change of violations there are a lot of communities with "Procedures for reacting to customer sickness" as the predominant violated criterion. There is also a big region of communities in the middle and south bay side area with the "Adequate management" violation as the most common one. This violation relates to hierarchy and reporting among the personnel of the restaurants.
Are there "safe to eat" areas or "dangerous to eat" areas in Chicago?
We decided to explore safety of the community areas using following criteria:
First four criteria we want to explore both by districts and by community areas. Therefore we first prepare dataframes with all necessary information for the analysis.
# Function that creates dataframe with all necessary columns for safe-danger analysis, grouping by District or Community Area
def create_df_for_group_analysis(groupingColumn):
    """Aggregate the module-level `inspections` dataframe per group of `groupingColumn`.

    Returns one row per group with: total inspections, inspections with a
    critical violation, failed inspections, total/failed re-inspections,
    complaint or suspected-food-poisoning inspections, and distinct restaurants.
    NOTE(review): groups absent from the re-inspection / complaint subsets end
    up NaN in those columns via index alignment — downstream code fills them.
    """
    group = inspections.copy().groupby(groupingColumn)
    # Column for total number of inspections
    df = group['Inspection ID'].count().rename('Total inspections').to_frame()
    # Column for number of inspections with critical violations
    df['# ins with critical violation'] = group.apply(lambda x: x[x['Critical Violation Noticed']]['Inspection ID'].count())
    # Column for number of failed inspections
    df['Failed inspections'] = group['Results'].apply(lambda x: (x=='Fail').sum())
    # Column for number of failed re-inspections
    re_inspections = inspections[inspections['Re-inspection']].copy()
    df['Total re-inspections'] = re_inspections.groupby(groupingColumn).count()['Inspection ID']
    df['Failed re-inspections']= re_inspections.groupby(groupingColumn)['Results'].apply(lambda x: (x=='Fail').sum())
    # Column for number of complaint/suspected food poisoning inspections
    complaint_inspections = inspections[(inspections['Inspection Type']=='Complaint') | (inspections['Inspection Type']=='Suspected Food Poisoning')].copy()
    df['Complaint/SFP inspections'] = complaint_inspections.groupby(groupingColumn)['Inspection ID'].count()
    # Column for number of restaurants (distinct license numbers per group)
    df['# restaurants'] = group['License #'].apply(set).apply(len)
    # Turn the group key back into a regular column
    df.reset_index(level=0,inplace=True)
    return df
# Build the aggregate tables at both granularities
district_metrics = create_df_for_group_analysis('District')
community_metrics = create_df_for_group_analysis('Community')
# Check for missing values introduced by index alignment
district_metrics.isna().any()
community_metrics.isna().any()
# Check missing values
community_metrics[community_metrics['Complaint/SFP inspections'].isna()]
# Check if there were Complaints or Suspected Food Poisonings in OAKLAND
inspections[(((inspections['Inspection Type']=='Complaint') | (inspections['Inspection Type']=='Suspected Food Poisoning'))&(inspections['Community']=='OAKLAND'))]
# Since there were no complaints, fill with 0
community_metrics['Complaint/SFP inspections'] = community_metrics['Complaint/SFP inspections'].fillna(0)
community_metrics.isna().any()
# Function that enriches dataframe with parameters we want to analyze
def compute_parameters_for_group_analysis(df):
    """Add the derived percentage/average columns used in the safe-vs-dangerous analysis and return `df`."""
    total = df['Total inspections']
    # Percentage of inspections in which a critical violation was noticed
    df['% Critical'] = df['# ins with critical violation']/total*100
    # Percentage of inspections that ended in failure
    df['% Failed'] = df['Failed inspections']/total*100
    # Percentage of re-inspections that ended in failure
    df['% Re-Failed'] = df['Failed re-inspections']/df['Total re-inspections']*100
    # Average number of complaint / suspected-food-poisoning inspections per restaurant
    df['Average complaints/SFP'] = df['Complaint/SFP inspections']/df['# restaurants']
    return df
# Derive the percentage/average metrics for both granularities
district_metrics = compute_parameters_for_group_analysis(district_metrics)
district_metrics.head()
community_metrics = compute_parameters_for_group_analysis(community_metrics)
community_metrics.head()
# Function that plots map divided into districts of communities (grouped_by) by using value from value_column
def plot_by_criteria(grouped_by, df, key_column, value_column, chart_name, options):
    """Render a choropleth of `value_column` per district/community.

    grouped_by selects both the geojson boundary file and the geojson key
    property ('district' or 'community'); `options` is forwarded to plot_map.
    """
    # Keep only the key and value columns
    kept_columns = {key_column, value_column}
    to_drop = [column for column in df.columns.values if column not in kept_columns]
    dataframe = df.drop(columns=to_drop)
    # Duplicate key and value columns so the hover template can interpolate them
    dataframe[grouped_by] = dataframe[key_column]
    dataframe['value'] = dataframe[value_column]
    # Boundary files follow the chicago-<granularity>-boundaries.json convention
    geodata_filename = f'chicago-{grouped_by}-boundaries.json'
    return plot_map(title=chart_name, geodata_path=os.path.join(data_folder, geodata_filename),
                    dataframe=dataframe, key_property=grouped_by, key_column=key_column,
                    value_column=value_column,
                    template='<b>{}</b>: {}',
                    mapbox_access_token=MAPBOX_ACCESS_TOKEN,
                    options=options)
As we previously discussed, not all violations are the same, and the most severe ones are critical violations. Therefore, if there are many inspections in which they were discovered, the area is considered more dangerous.
# Shared map options: red sequential scale with a percentage colorbar
options = {
    'colorscale': 'Reds',
    'colorbar': {
        'title': 'Percentage',
        'ticksuffix': '%'
    }
}
figure = plot_by_criteria('district', district_metrics, 'District', '% Critical', 'Percentage of inspections with critical violations per district', options)
figure.show()
#plot_offline(figure,"sd_critical_district.html")
By looking at districts, we can see that South Side restaurants make the most critical violations, while the Northwest side is the safest district considering this criterion.
# Same criterion (critical violations) at community-area granularity
figure = plot_by_criteria('community', community_metrics, 'Community', '% Critical', 'Percentage of inspections with critical violations per community', options)
figure.show()
#plot_offline(figure,"sd_critical_area.html")
If we take a closer look, Washington park restaurants on average make critical violation every fourth inspection. Oakland in South Side and Uptown in Far North Side are also highly ranked and can get a title of dangerous areas.
If the proportion of failures among total inspections is higher for some areas then for others, we consider them more dangerous areas.
# Share of failed inspections per district
figure = plot_by_criteria('district', district_metrics, 'District', '% Failed','Percentage of failed inspections per district', options)
figure.show()
#plot_offline(figure,"sd_failures_district.html")
South Side is really leading in number of failures. Almost every 4th inspection ended in failure! City centre seems to have the least problems with failures.
# Share of failed inspections per community area
figure = plot_by_criteria('community', community_metrics, 'Community', '% Failed','Percentage of failed inspections per community area', options)
figure.show()
#plot_offline(figure,"sd_failures_area.html")
We can now clearly see that the source for such bad results of South Side are Oakland and Washington Park. Every third inspection in these community areas is a failed one. City center and Airports are pretty safe, as well as community areas in Far Southeast side.
# Share of failed re-inspections per district.
figure = plot_by_criteria('district', district_metrics, 'District', '% Re-Failed','Percentage of failed re-inspections per district', options)
figure.show()
#plot_offline(figure,"sd_reinspection_failed_district.html")
Again, worst performing district is South Side.
# Share of failed re-inspections per community area.
figure = plot_by_criteria('community', community_metrics, 'Community', '% Re-Failed', 'Percentage of failed re-inspections per community area', options)
figure.show()
# NOTE(review): the original commented-out filename duplicated the district plot's.
#plot_offline(figure,"sd_reinspection_failed_community.html")
With 40% of failed re-inspections, Oakland definitely deserves to be avoided when choosing restaurants.
# New styling for the complaint maps: plain count colorbar, no percent suffix.
options = {
'colorscale': 'Reds',
'colorbar': {
'title': 'Avg. no. of complaints'
}
}
# Average number of complaint / suspected-food-poisoning inspections per restaurant, per district.
figure = plot_by_criteria('district', district_metrics,'District', 'Average complaints/SFP',' Average number of complaint or suspected food poisioning inspections per restaurant per district', options)
figure.show()
#plot_offline(figure,"sd_complaint_district.html")
We can clearly see that the South is definitely more problematic than the North.
# Same complaint metric at community-area granularity.
figure = plot_by_criteria('community', community_metrics, 'Community', 'Average complaints/SFP',' Average number of complaint or suspected food poisioning inspections per restaurant per community area', options)
figure.show()
#plot_offline(figure,"sd_complaint_community.html")
Chatham and Calumet Heights have on average around 3.5 complaints per restaurant. What is really interesting is that Oakland, previously considered a food-dangerous area, didn't have any complaint inspection in the previous ten years!
As a last criteria for measuring whether neighborhood is "safe to eat" or "dangerous to eat", we will take into account the aggregate results of inspections for the restaurants in that area.
# Get contingency table: one row per community, one column per inspection result,
# cell = number of inspections with that result in that community.
inspections_scores_by_area = inspections.groupby(['Community','Results']).size().unstack('Results', fill_value=0)
inspections_scores_by_area.head()
We decided to create a formula which will give the overall score for each neighborhood, taking into account the number of inspections per each category, as well as penalizing the negative results and awarding the positive ones. If the restaurant was suspended, which means Out of Business or Business Not Located (according to the docs), that gives -2 point. If the restaurant failed, it will be -1 points. If it passes with condition, the score is 0.5 points. If the restaurant passed clearly, it gives 1 point. We then divide that by the total amount of restaurants in the neighborhood.
# Function that gives the safety score for the neighborhood
def get_safety_score(df):
    """Compute a per-area safety score from inspection-result counts.

    Weighting: Business Not Located / Out of Business count -2 each,
    Fail -1, Pass w/ Conditions +0.5, Pass +1. The weighted sum is
    normalized by the total number of inspections in each row and
    rounded to two decimals.

    df: DataFrame with the five result-count columns listed below.
    Returns a float Series indexed like df.
    """
    weights = {
        'Business Not Located': -2,
        'Out of Business': -2,
        'Fail': -1,
        'Pass w/ Conditions': 0.5,
        'Pass': 1,
    }
    weighted_total = sum(w * df[col] for col, w in weights.items())
    total_inspections = df.sum(axis=1)
    return (weighted_total / total_inspections).round(2)
# Add safety score to DF as a new column, then preview the safest areas first.
inspections_scores_by_area['Safety score'] = pd.Series(get_safety_score(inspections_scores_by_area))
inspections_scores_by_area.sort_values('Safety score', ascending=False).head()
Now, we want to visualise the results in order to see if there is any pattern.
# Move 'Community' out of the index so plot_by_criteria can join on it as a column.
inspections_scores_by_area.reset_index(level=0,inplace=True)
# Visualise neighorhoods with respect to the Safety Score they got.
# Reversed YlOrRd so low scores (dangerous) show dark red.
# NOTE(review): .reverse() mutates the shared px colorscale list in place.
colorscale1 = px.colors.sequential.YlOrRd
colorscale1.reverse()
options = {
'colorscale': colorscale1,
'colorbar': {
'title': 'Safety score'
}
}
figure = plot_by_criteria('community', inspections_scores_by_area, 'Community', 'Safety score', 'Safety scores by community areas', options)
figure.show()
#plot_offline(figure,"sd_safety_score_area.html")
According to the safety scores we calculated, it seems that the safest places to eat are the two Airports and the City Center. That's right, you may need to pay extra, since those are central tourist points, but you can be quite sure that the restaurant you are eating at is fulfilling the regulations. As you move further from these places, the risk increases.
Checking restaurants history of inspections, how are restaurants or whole areas changing in their inspection scores? Can one see any patterns in improvements with respect to inspection results? Are there areas/restaurant chains/restaurant types that follow some trends?
For the analysis of the history of inspections, we will use only the three most frequent results: Pass, Pass w/ conditions and Fail.
As a first indicator of restaurant quality, we will use the number of safety violations per inspection.
# Number of violations per inspection = length of the 'Violation Codes' list in each row.
inspections["nb_violations"] = inspections.apply(lambda row: len(row['Violation Codes']), axis=1)
inspections.sample(5)
Let's compare the number of violations for inspections with results "Pass", "Pass w/ conditions" and "Fail" :
# Distribution of violation counts for passing vs. failing inspections.
inspections[inspections.Results == "Pass"].nb_violations.describe()
inspections[inspections.Results == "Fail"].nb_violations.describe()
As expected, the average number of violations is higher for restaurants that failed their inspection. However, an important detail shouldn't be overlooked: quartiles show that some failing restaurants have fewer violations than passing ones. We will therefore consider two parameters to evaluate how much restaurants improve over time: the difference between the initial average number of violations and the same average at the last inspection, and their inspection result (Pass, Pass w/ conditions and Fail).
def nvdiff(group):
    """Return the change in violation count between a license's first and
    last inspection, in chronological order (negative = improvement)."""
    ordered = group.sort_values(by="Inspection Date")
    first_count = ordered.iloc[0].nb_violations
    last_count = ordered.iloc[-1].nb_violations
    return last_count - first_count
# Per-license first-to-last change in violations, then reshape the Series into a DataFrame.
violations_evo = inspections.groupby("License #").apply(nvdiff)
violations_evo = pd.DataFrame({'License #':violations_evo.index, 'Violations evolution':violations_evo.values})
violations_evo.sample(5)
We can now compute the average evolution score per community area (which is the average of the difference between the number of violations at first and last inspections for all restaurants of a given area).
# Attach each license's community, then average the evolution per community area.
violations_evo_community = violations_evo.merge(inspections[['License #','Community']], on='License #', how='left')\
.groupby('Community')['Violations evolution'].mean()
violations_evo_community = pd.DataFrame({'Community': violations_evo_community.index, 'evolution': violations_evo_community.values})
violations_evo_community.sample(5)
# Red-scaled map of the average violations-evolution score per community.
options = {
'colorscale': 'Reds',
'colorbar': {
'title': 'Evolution score'
}
}
figure = plot_by_criteria('community', violations_evo_community, 'Community', 'evolution', 'Violations evolution score by community', options)
figure.show()
#plot_offline(figure,"violations_evolution_community.html")
We can see that in most areas, restaurants manage to decrease their number of violations over time. But in a few areas, restaurants have increased their number of violations.
We can also visualize the number of restaurants who have improved, got worse or stagnated over time.
# Bucket the per-restaurant evolution: negative = fewer violations over time (Improved),
# near zero = Stagnated, positive = Worsened; then bar-plot the counts.
bins = [-np.inf,-0.1,0.1,np.inf]
cats = ["Improved", "Stagnated", "Worsened"]
pd.cut(violations_evo["Violations evolution"], bins, labels=cats).value_counts().plot.bar()
Most restaurants have improved or stagnated over time in their number of violations. The number of restaurants which got worse over time is smaller than the two other categories.
We will now compare the amount of restaurants who passed their most recent inspection in 2010 and 2018, per area. This will give us some insight in restaurant safety evolution over time.
# Number of inspections done in 2010
inspections[(inspections['Inspection Date'] >= '2010-01-01') & (inspections['Inspection Date'] < '2011-01-01')]['License #'].nunique()
# Number of inspections done in 2018
inspections[(inspections['Inspection Date'] >= '2018-01-01') & (inspections['Inspection Date'] < '2019-01-01')]['License #'].nunique()
# Selecting inspections done in 2010 and 2018
chicago_2010 = inspections[(inspections['Inspection Date'] >= '2010-01-01') & (inspections['Inspection Date'] < '2011-01-01')].copy()
chicago_2018 = inspections[(inspections['Inspection Date'] >= '2018-01-01') & (inspections['Inspection Date'] < '2019-01-01')].copy()
# Collapse Results to a boolean: True for "Pass" and "Pass w/ Conditions".
chicago_2010["Results"] = chicago_2010["Results"].apply(lambda x : x.startswith("Pass"))
chicago_2018["Results"] = chicago_2018["Results"].apply(lambda x : x.startswith("Pass"))
# Computing number of restaurants which passed their last inspection per community
# (take each license's latest inspection of the year, then sum the booleans per community).
pass_by_community2010 = chicago_2010.groupby(["License #", "Community"])\
.apply(lambda g : g.sort_values(by="Inspection Date").iloc[-1].Results)\
.groupby("Community")\
.sum()
pass_by_community2018 = chicago_2018.groupby(["License #", "Community"])\
.apply(lambda g : g.sort_values(by="Inspection Date").iloc[-1].Results)\
.groupby("Community")\
.sum()
# Converting counts to int
pass_by_community2010 = pd.DataFrame({"Community": pass_by_community2010.index, "Pass": pass_by_community2010.values})
pass_by_community2018 = pd.DataFrame({"Community": pass_by_community2018.index, "Pass": pass_by_community2018.values})
pass_by_community2010.Pass = pass_by_community2010.Pass.apply(lambda x : int(x))
pass_by_community2018.Pass = pass_by_community2018.Pass.apply(lambda x : int(x))
As not all areas have the same amount of restaurants, for each area we need to divide the number of restaurants passing inspections by the number of restaurants in the area to get a percentage.
# Restaurants per community in each year (unique license counts), used as denominators.
community_count_2010 = chicago_2010.groupby("Community").apply(lambda g : g["License #"].nunique())
community_count_2018 = chicago_2018.groupby("Community").apply(lambda g : g["License #"].nunique())
community_count_2010 = pass_by_community2010.merge(pd.DataFrame({"Community": community_count_2010.index, "Count": community_count_2010.values}),on='Community', how='left')
community_count_2018 = pass_by_community2018.merge(pd.DataFrame({"Community": community_count_2018.index, "Count": community_count_2018.values}),on='Community', how='left')
# Computing pass percentages
community_count_2010.Pass = community_count_2010.apply(lambda row : row.Pass/row.Count, axis=1)
community_count_2018.Pass = community_count_2018.apply(lambda row : row.Pass/row.Count, axis=1)
# The raw counts are no longer needed once the ratio is computed.
community_count_2010 = community_count_2010.drop("Count",axis=1)
community_count_2018 = community_count_2018.drop("Count",axis=1)
# Convert from 0 to 1 floats to percentages and round it
def convert_to_rounded_percentages(dataframe, column):
    """Convert `column` in place from 0-1 fractions to percentages
    rounded to one decimal place. Returns None (mutates `dataframe`)."""
    dataframe[column] = (dataframe[column] * 100).round(1)
# Convert both yearly pass-ratio columns to rounded percentages in place.
convert_to_rounded_percentages(community_count_2010, 'Pass')
convert_to_rounded_percentages(community_count_2018, 'Pass')
We can now plot the percentage of valid restaurants per area in 2010...
# Red (low) to yellow (high) scale for the pass-percentage maps.
options = {
'colorscale': [
[0.0, 'rgb(255, 0, 0)'],
[1.0, 'rgb(255, 255, 0)']
],
'colorbar': {
'title': 'Percentage',
'ticksuffix': '%'
}
}
figure = plot_by_criteria('community', community_count_2010, 'Community', 'Pass', 'Percentage of passing restaurants in 2010', options)
figure.show()
#plot_offline(figure,"percentage_passing_2010.html")
... and in 2018 :
# Same pass-percentage map for 2018 (reuses `options`).
figure = plot_by_criteria('community', community_count_2018, 'Community', 'Pass', 'Percentage of passing restaurants in 2018', options)
figure.show()
#plot_offline(figure,"percentage_passing_2018.html")
Overall, it seems that the percentage of restaurants passing inspections has decreased between 2010 and 2018 (which shows that the number of safety violations and inspection results are not related, as we saw previously that most areas have decreased their number of violations over time). We can notice that the lowest percentage is lower in 2018 compared to 2010.
We can now look into the evolution of risk levels over time. We define the risk level of a community area as the average of the risk levels of all restaurants in that area, as determined by their latest inspection. We will compare average risk levels between 2010 and 2018.
# Inspect the distinct Risk values and look at the rows with the anomalous "All" label.
chicago_2018.Risk.unique()
chicago_2018[chicago_2018.Risk == "All"]
We can see that two restaurants have the string "All" in their Risk column instead of a risk level. As these two restaurants have just been granted their license and have no inspection results, we will consider their risk level to be maximal (Risk 1 - High).
# Treat the "All" placeholder as the highest risk level (only observed in 2018 here).
chicago_2018.replace({'Risk':{'All':'Risk 1 (High)'}},inplace=True)
chicago_2010 = chicago_2010[~chicago_2010.Risk.isna()]
# Parse "Risk N (...)" strings down to the integer N.
chicago_2010.loc[:,'Risk'] = chicago_2010.Risk.apply(lambda r : int(r.split(" ")[1]))
chicago_2010.sample(5)
chicago_2018 = chicago_2018[~chicago_2018.Risk.isna()]
chicago_2018.loc[:,'Risk'] = chicago_2018.Risk.apply(lambda r : int(r.split(" ")[1]))
# Computing average risk level per area:
# take each license's latest inspection of the year, then average the risk per community.
risk_by_com2010 = chicago_2010.groupby(["License #","Community"])\
.apply(lambda g : g.sort_values(by="Inspection Date").iloc[-1].Risk)\
.groupby("Community")\
.mean()
risk_by_com2018 = chicago_2018.groupby(["License #","Community"])\
.apply(lambda g : g.sort_values(by="Inspection Date").iloc[-1].Risk)\
.groupby("Community")\
.mean()
risk_by_com2010
# Reshape the Series into DataFrames for plotting.
risk_by_com2010 = pd.DataFrame({"Community":risk_by_com2010.index,"Mean risk": risk_by_com2010.values})
risk_by_com2018 = pd.DataFrame({"Community":risk_by_com2018.index,"Mean risk": risk_by_com2018.values})
We can now visualize the average risk level per area in 2010 :
# Red (low value = high risk) to yellow (high value = low risk) scale.
options = {
'colorscale': [
[0.0, 'rgb(255, 0, 0)'],
[1.0, 'rgb(255, 255, 0)']
],
'colorbar': {
'title': 'Risk level'
}
}
figure = plot_by_criteria('community', risk_by_com2010, 'Community', 'Mean risk',\
'Mean risk level per area in 2010', options)
#plot_offline(figure,"mean_risk_area_2010.html")
figure.show()
And in 2018 :
# Same styling and map for 2018.
options = {
'colorscale': [
[0.0, 'rgb(255, 0, 0)'],
[1.0, 'rgb(255, 255, 0)']
],
'colorbar': {
'title': 'Risk level'
}
}
figure = plot_by_criteria('community', risk_by_com2018, 'Community', 'Mean risk',\
'Mean risk level per area in 2018', options)
#plot_offline(figure,"mean_risk_area_2018.html")
figure.show()
It seems that risk level averages have decreased, which means that restaurants are overall considered less safe.
Let's now have a look at how the performance of famous restaurant chains changes over time.
# Count distinct licenses per AKA Name — a proxy for chain size — largest first.
chain_counts = inspections[["License #","AKA Name"]].drop_duplicates().groupby("AKA Name")\
.agg('count').rename(columns={'License #':'count'}).sort_values(by="count",ascending=False)
chain_counts.head(30)
We will study some of the most popular restaurant chains in America :
Unfortunately, restaurant names in the dataset contain different spellings, and some also contain numbers. To identify restaurant chains, we need to find a substring for each chain name that is present only in the names of restaurants belonging to that chain.
# Filtering McDonalds: match "Donald" but exclude hot-dog places like "Donald's Dogs".
chicago_clean_names = inspections[~inspections["AKA Name"].isna()]
chicago_clean_names[(chicago_clean_names["AKA Name"].str.contains("Donald",case=False)) & ~
(chicago_clean_names["AKA Name"].str.contains("dogs",case=False))]["AKA Name"].unique()
We can see that all McDonalds restaurants, despite different names, all contain "Donald" in their names; hence we will use "Donald" to identify McDonalds.
# For each chain, list the distinct AKA Name spellings the substring filter catches.
# Filtering Dunkin Donuts
chicago_clean_names[chicago_clean_names["AKA Name"].str.contains("dunkin donuts",case=False)]["AKA Name"].unique()
# Filtering Subway
chicago_clean_names[chicago_clean_names["AKA Name"].str.contains("Subway",case=False)]["AKA Name"].unique()
# Filtering KFC (either abbreviation or full "Kentucky" name)
chicago_clean_names[chicago_clean_names["AKA Name"].str.contains("KFC",case=False) | \
chicago_clean_names["AKA Name"].str.contains("Kentucky",case=False)]["AKA Name"].unique()
# Filtering Burger King (excluding generic "hamburger" matches)
chicago_clean_names[(chicago_clean_names["AKA Name"].str.contains("burger king",case=False)) & ~
(chicago_clean_names["AKA Name"].str.contains("hamburger",case=False))]["AKA Name"].unique()
# Filtering Pizza Hut
chicago_clean_names[chicago_clean_names["AKA Name"].str.contains("pizza hut",case=False)]["AKA Name"].unique()
# Filtering Taco Bell (both words required, in any position)
chicago_clean_names[(chicago_clean_names["AKA Name"].str.contains("taco",case=False)) & (chicago_clean_names["AKA Name"].str.contains("bell",case=False))]["AKA Name"].unique()
# Filtering Starbucks
chicago_clean_names[chicago_clean_names["AKA Name"].str.contains("starbucks",case=False)]["AKA Name"].unique()
def avg_violations_chain(df):
    """For each year 2010-2019, average the per-license mean violation
    count across a chain's restaurants.

    df: inspections for one chain, with 'Inspection Date' (datetime),
    'License #' and 'nb_violations' columns.
    Returns a list of ten one-element Series (index 'nb_violations');
    years with no inspections yield NaN.
    """
    yearly_means = []
    for year in range(2010, 2020):
        year_rows = df[df["Inspection Date"].dt.year == year]
        per_license = year_rows[["License #", "nb_violations"]].groupby("License #").agg("mean")
        yearly_means.append(per_license.mean())
    return yearly_means
# Materialize one sub-DataFrame per chain using the filters validated above.
mcdo = chicago_clean_names[(chicago_clean_names["AKA Name"].str.contains("Donald",case=False)) & ~
(chicago_clean_names["AKA Name"].str.contains("dogs",case=False))]
subway = chicago_clean_names[chicago_clean_names["AKA Name"].str.contains("Subway",case=False)]
ddonuts = chicago_clean_names[chicago_clean_names["AKA Name"].str.contains("dunkin donuts",case=False)]
kfc = chicago_clean_names[chicago_clean_names["AKA Name"].str.contains("KFC",case=False) | \
chicago_clean_names["AKA Name"].str.contains("Kentucky",case=False)]
burgerk = chicago_clean_names[(chicago_clean_names["AKA Name"].str.contains("burger king",case=False)) & ~
(chicago_clean_names["AKA Name"].str.contains("hamburger",case=False))]
pizza_hut = chicago_clean_names[chicago_clean_names["AKA Name"].str.contains("pizza hut",case=False)]
taco_bell = chicago_clean_names[(chicago_clean_names["AKA Name"].str.contains("taco",case=False)) & (chicago_clean_names["AKA Name"].str.contains("bell",case=False))]
starbucks = chicago_clean_names[chicago_clean_names["AKA Name"].str.contains("starbucks",case=False)]
# One row per chain with list-valued Year / avg_violation columns, exploded to long form.
# NOTE(review): Dunkin Donuts, KFC and Burger King are built above but the Chain
# list below covers 7 names for the 7 frames passed in — order must stay aligned.
chains_df = pd.DataFrame({"Chain":["McDonald's","Subway","Dunkin Donuts","KFC","Pizza Hut",\
"Taco Bell","Starbucks"],"Year":[list(range(2010,2020)) for i in \
range(7)],"avg_violation":
[[y.values[0] for y in avg_violations_chain(x)] for x in [mcdo,subway,ddonuts,kfc,pizza_hut,taco_bell,\
starbucks]]})
chains_df = chains_df.set_index(['Chain']).apply(pd.Series.explode).reset_index()
chains_df = chains_df.rename(columns={"avg_violation":"Average number of violations"})
We can now plot the average number of violations per inspection, for each year, for the restaurant chains we selected :
import plotly.graph_objects as go
# One line+marker trace per chain; hover shows chain, year and the yearly average.
fig = go.Figure()
for chain in chains_df.Chain.unique() :
fig.add_trace(go.Scatter(
x = chains_df[chains_df.Chain == chain].Year,
y = chains_df[chains_df.Chain == chain]["Average number of violations"],
mode = 'lines+markers',
name = chain,
hovertemplate =
f'<b>{chain}</b>' +
'<br><i>Year</i>: %{x}'+
'<br><b>Average number of violations</b>: %{y:.2f}<br>'
))
# Shared x-hover so all chains' values show for a given year at once.
fig.update_layout(hovermode='x')
fig.show()
#plot_offline(fig,"average_violation.html")
The restaurant chains with the lowest average number of violations, are (in chronological order) Subway, Starbucks, Subway, Starbucks, Taco Bell, KFC then Taco Bell. What about their average inspection passing rates ?
# Pass rate per chain = 1 - (failed inspections / total inspections).
chains_pass_df = pd.DataFrame({"Chain":["McDonald's","Subway","Dunkin Donuts","KFC","Pizza Hut",\
"Taco Bell","Starbucks"],"Inspection pass rate":
[(1-x[x.Results == "Fail"].shape[0]/x.shape[0]) for x in [mcdo,subway,ddonuts,kfc,pizza_hut,taco_bell,\
starbucks]]})
fig = px.bar(chains_pass_df,x="Chain",y="Inspection pass rate",color="Inspection pass rate")\
.for_each_trace(lambda t: t.update(name=t.name.replace("Chain=","")))
fig.show()
# NOTE(review): the original commented call passed `figure`; this cell's variable is `fig`.
#plot_offline(fig,"inspections_pass_rate.html")
The restaurant chain with the lowest passing rate is McDonald's, and the highest rate belongs to Starbucks.
How is restaurant performance in terms of inspection results related to geodemographic characteristics of the area (e.g. Life Quality Index)?
# Load the Life Quality Index per zip code.
lqi_zipcode = pd.read_csv(os.path.join(data_folder, "lqi_indexes.csv"), index_col=False)
lqi_zipcode.head()
# Cast Zip code values to string so they can be matched against geojson keys.
lqi_zipcode.zip_code = lqi_zipcode.zip_code.apply(lambda x: str(x))
# Compare distinct zip counts between the LQI table and the inspections data.
lqi_zipcode.nunique()
inspections.Zip.nunique()
There is one zip code missing in our life quality index, it corresponds to Chicago airport (which makes sense as the airport is not the area where people live). In our analysis we will therefore use it for all of the other neighborhoods.
# Drop the leftover CSV index column before plotting.
lqi_zipcode.drop(columns='Unnamed: 0', inplace=True)
options = {
'colorscale':'Plasma',
'colorbar': {
'title': 'LQI'
}
}
figure = plot_by_criteria('zip', lqi_zipcode, 'zip_code', 'life_quality_index', 'Life Quality Index by neighborhood', options)
figure.show()
#plot_offline(figure,"life_quality_index.html")
Life quality index in Chicago is the highest in bay area and goes down as we move away, generally we can say that south part of the city has worse life quality than the north part.
This actually matches the insights we got when analysing safe and dangerous areas in Chicago. Although we had many criteria for assessing how safe or dangerous it is to eat out in a particular area, in most of them South performed worse than North.
How are inspection results connected to customer reviews? Are the best scored restaurants the ones with the best inspection results as well? Which client-reported issues are also noticed by inspections? Which issues are only discovered by inspections?
We decided to use the Google Places API to retrieve information about restaurant ratings. We queried the API by DBA Name value and the Longitude and Latitude of a restaurant. Sometimes, however, API responses didn't match the area we were concerned about, meaning that they returned a restaurant with a similar name but not in Chicago. Some restaurants couldn't be retrieved at all. To be sure we analyze the API response correctly, we start by checking if the restaurants for which we have ratings are the same ones we queried for. We do this by checking name similarity and addresses.
restaurants.csv is a file which contains place_id (returned by Google API), restaurant name, address, zip code, latitude and longitude from Chicago Inspections dataset. We will use this csv to compare with what was retrieved from Google API.
# Load the Google Places API responses and the restaurants we queried them for.
google_places_path = os.path.join(data_folder, "google_places.csv")
restaurants_path = os.path.join(data_folder, "restaurants.csv")
google_places = pd.read_csv(google_places_path)
restaurants = pd.read_csv(restaurants_path)
google_places.head()
restaurants.head()
Let's see if all restaurants we got from Google are the ones we asked for and are located in Chicago:
We noticed that address from Chicago database has whitespace at the end so we remove it from each row:
# Trim the trailing whitespace the Chicago database leaves on addresses.
restaurants.address = restaurants.address.apply(str.strip)
print(f"Google places API returned {len(google_places)} restaurants")
Check if all restaurants are in Chicago:
# Rows whose city column is not literally "chicago" need closer inspection.
not_chicago_restaurants = google_places[~(google_places.city == "chicago")]
print(f"Number of restaurants with non-chicago city {len(not_chicago_restaurants)}")
not_chicago_restaurants.head(10)
We can see that sometimes the zip code was returned as a city, Chicago zip codes start with 60 therefore those entries where city is equal to a number starting with 60 are probably in Chicago and their city name is in the address column
# Count rows where the columns are shifted: "chicago" landed in address
# and a 60xxx zip code landed in city.
address_chicago_count = len(google_places[google_places.address == "chicago"])
print(f"Number of places with chicago as address and zip code in city column is {address_chicago_count}")
google_places[google_places.city.str.startswith("60", na=False)]
If zip code starts with 60 move it to zip code column
# Copy zip-looking city values into zip_code for the affected rows
# (index alignment restricts the assignment to chicago_idx rows).
chicago_idx = google_places.city.str.startswith("60", na=False)
google_places.loc[chicago_idx, 'zip_code'] = google_places['city']
google_places[google_places.city.str.startswith("60", na=False)]
If address is Chicago move it to city column
# Where the address column held "chicago", move it to city and blank the address.
address_idx = (google_places.address == "chicago")
google_places.loc[address_idx, "city"] = "chicago"
google_places.loc[address_idx, "address"] = np.nan
# Re-check how many rows still look non-Chicago after the fixes.
not_chicago_restaurants = google_places[~(google_places.city == "chicago")]
print(f"Number of restaurants with non-chicago city {len(not_chicago_restaurants)}")
We can see that sometimes there are still cities denoted with zip codes starting with 60, let's check city for those values
# Remaining rows with zip-like city values (small towns in the Chicago agglomeration).
google_places[google_places.city.str.startswith("60", na=False)]
It turns out that entries such as Cicero, North Riverside, Schaumburg or Elgin are within the Chicago agglomeration; they are smaller towns or districts.
# Rows that are neither a 60xxx zip nor city "chicago" — genuinely outside the area.
google_places[(~(google_places.zip_code.str.startswith("60", na=False)) & (google_places.city != "chicago"))]
We have 1071 rows which correspond to restaurants in different places than Chicago area. They are of no use to us so we remove them.
# Keep only places with a Chicago-area zip (60xxx) or city "chicago".
google_places_cleaned = google_places[((google_places.zip_code.str.startswith("60", na=False)) | (google_places.city == "chicago"))]
google_places_cleaned.head()
We will remove any duplicating rows from both dataframes as sometimes Google API might have returned same place_id for two distinct restaurants
# Silence the chained-assignment warning: google_places_cleaned is a view slice
# that we deliberately mutate in place below.
pd.set_option('mode.chained_assignment', None)
google_places_cleaned.drop_duplicates(inplace=True)
restaurants.drop_duplicates(inplace=True)
Now we want to compare addresses and names of restaurants. We will start with joining Google places API output dataframe with restaurants dataframe on place_id. We perform left join on google_places_cleaned as we are only interested in cases when we received a response from Google API.
# Left-join API output with our queried restaurants on place_id;
# suffixes disambiguate the shared column names.
merged = google_places_cleaned.merge(restaurants, on="place_id", how="left", suffixes=("_google", "_restaurants"))
len(merged)
merged.head()
Sometimes we got a response from Google but we didn't get a rating, we drop rows where we couldn't retrieve rating from Google API.
# Inspect, then drop, rows where Google responded without a rating.
merged[merged.rating.isna()]
merged.dropna(subset=["rating"],axis=0, inplace=True)
To check if for sure we have correct entries we will compare zip codes. If no zip_code is available from restaurants then we will compare names. First step would be to check where zip_code_restaurants is NaN
and second would be to remove second zip code component from zip_code_google
# Rows with no restaurant-side zip code — few enough to review by hand.
merged[merged.zip_code_restaurants.isna()]
We have only a couple of rows where zip_code_restaurants is not available, we can review them manually. It turns out that we will have to remove row 941, 11062 and McCormick place which is not a restaurant but a business center in Chicago.
# Manually-identified bad matches (plus McCormick Place, a business center,
# not a restaurant) dropped by row index after the review above.
merged.drop(axis=0, index=[941, 11062, 11155, 8756], inplace=True)
Now we will remove second zip_code_google component
Filling nans with string value to remove second zip_code_google component
# Temporarily stringify NaNs so slicing works, keep only the 5-digit ZIP prefix
# (dropping the ZIP+4 component), then restore NaNs and cast to float to match
# zip_code_restaurants' dtype.
merged.zip_code_google.fillna("nan", inplace=True)
merged.zip_code_google.str.len().unique()
merged.zip_code_google = merged.zip_code_google.apply(lambda x: x[:5])
merged.zip_code_google = merged.zip_code_google.replace("nan", np.nan)
merged.zip_code_google = merged.zip_code_google.apply(float)
# Keep only rows where Google's zip agrees with the restaurant database's zip.
matching = merged[merged.zip_code_google == merged.zip_code_restaurants]
At last to be sure that we have rating corresponding to a correct place we will compare names using python difflib. From docs "ratio() returns a float in [0, 1], measuring the similarity of the sequences. As a rule of thumb, a ratio() value over 0.6 means the sequences are close matches". We will use this property to discard any rows where name similarity is lower than 0.6. This step is necessary as sometimes name from Chicago database doesn't exactly match a name returned by Google API for instance in Chicago database we have: "yolk - test kitchen" while Google's response name is "yolk test kitchen"
from difflib import SequenceMatcher
def get_similarity_score(restaurant_name, google_name):
    """Return a similarity ratio in [0, 1] between the two names.

    Uses difflib.SequenceMatcher with spaces flagged as junk, so the
    comparison focuses on the non-space characters.
    """
    matcher = SequenceMatcher(isjunk=lambda ch: ch == " ",
                              a=restaurant_name,
                              b=google_name)
    return matcher.ratio()
# Score every name pair, then keep only close matches (difflib's rule of thumb: > 0.6).
matching['name_similarity_score'] = matching.apply(lambda row: get_similarity_score(row["place_name_restaurants"], row["place_name_google"]), axis=1)
matching = matching[matching.name_similarity_score > 0.6]
As a last step we remove rows with duplicated place_id
# One row per place_id: the API occasionally returned the same id for distinct queries.
matching = matching.drop_duplicates(subset=["place_id"])
len(matching)
we have ratings for 5700 restaurants
We will transform matching dataframe to use it for further analysis, we will remove useless columns and save it to csv file
# Drop working/duplicate columns, rename the kept restaurant-side columns back
# to their plain names, then look at the ratings-count distribution.
matching.drop(["name_similarity_score", "place_name_google", "price_level", "address_google", "zip_code_google", "city"], axis=1, inplace=True)
matching.rename(columns={"place_name_restaurants": "place_name", "address_restaurants": "address", "zip_code_restaurants": "zip_code"}, inplace=True)
matching.total_number_of_ratings.describe()
sns.set(rc={'figure.figsize':(12,8)})
sns.distplot(matching.total_number_of_ratings)
matching.total_number_of_ratings.mode()
As the number of ratings seems to follow a power law and most restaurants have a low number of ratings, it would be very biased to compare them with restaurants that have tens of thousands of ratings. Therefore we decided to drop this research question.